First Step: Plot the distribution of resolution time.

# Load the 311 service-request extract; the first CSV row holds column names.
df <- read.csv(
  "~/Desktop/CU 2016 Spr/DV CU/Final_proj/final_proj/311_15.csv",
  header = TRUE
)

# Copy of the requests ordered by closing date, most recent first.
date_df <- df[
  order(as.Date(df$Closed.Date, format = "%m/%d/%Y"), decreasing = TRUE),
]

# Requests whose status is still "Open".
df_open <- df[df$Status == "Open", ]

# Resolution time: closing date minus creation date, as a difftime in days.
df$date_diff <- difftime(
  as.Date(as.character(df$Closed.Date), format = "%m/%d/%Y"),
  as.Date(as.character(df$Created.Date), format = "%m/%d/%Y"),
  units = "days"
)

# Homeless-encampment complaints only, restricted to the first 54 columns.
# NOTE(review): 1:54 presumably means "every original column plus date_diff";
# confirm against the CSV schema before relying on it.
df_hl_diff <- df[df$Complaint.Type == "Homeless Encampment", 1:54]

# Download a terrain basemap that covers every point in `df`.
#
# df: data frame with numeric `Longitude` and `Latitude` columns.
# Returns the map object produced by get_map() for the bounding box that
# make_bbox() derives from those coordinates.
get_bc_map <- function(df) {
  bbox <- make_bbox(lon = df$Longitude, lat = df$Latitude)
  get_map(location = bbox, source = "google", maptype = "terrain")
}

# Fetch the basemap covering every homeless-encampment complaint.
bc_map <- get_bc_map(df_hl_diff)
## converting bounding box to center/zoom specification. (experimental)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=40.710281,-73.983187&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false

# Turn the difftime resolution time into a plain number of days so that it
# can be used both as a discrete colour key and as a model response.
df_hl_diff$date_diff <- as.numeric(df_hl_diff$date_diff)

# One point per complaint, coloured by resolution time in days.
# The colour aesthetic names the column directly instead of reaching back
# into the data frame with `$`, so it is always evaluated against the rows
# handed to geom_point().
ggmap(bc_map) +
  geom_point(
    data = df_hl_diff,
    mapping = aes(x = Longitude, y = Latitude, colour = factor(date_diff))
  ) +
  labs(
    title = "Homeless Encampment Issue Solving Time Distribution",
    x = "Longitude",
    y = "Latitude",
    colour = "Length of Time"
  )

Second Step: Choose features for the predictive model.

# df_hl_diff holds only the Homeless Encampment complaints.
# Factor levels inspected while choosing predictors:
#   levels(df_hl_diff$Agency.Name)   # "Internal Affairs Bureau", "New York City PolicDepartment", "NYPD"
#   levels(df_hl_diff$Borough)
#   levels(df_hl_diff$Descriptor)    # 46 features
#   levels(df_hl_diff$Location.Type) # 19; remember to remove ""
#
# Keep the three categorical predictors plus the response, and drop the rows
# whose Location.Type is the empty string.
df_pred <- df_hl_diff[
  df_hl_diff$Location.Type != "",
  c("Address.Type", "Borough", "Location.Type", "date_diff")
]

Third Step: Build the model. I ran the random forest algorithm to predict the resolution time given 32 dummy variables.

# One-hot encode the categorical predictors in df_pred.
df_dummies <- dummyVars(~ ., data = df_pred)
df_data <- as.data.frame(predict(df_dummies, newdata = df_pred)) # dummy df

# Hold out the first quarter of the rows for testing and train on the rest.
# Both slices are cut at cut_off_index *before* incomplete rows are dropped:
# starting the training slice at nrow(test_data) + 1 after na.omit() would
# pull held-out rows into the training set whenever the test slice loses
# rows. seq_len() also keeps the test slice empty (rather than c(1, 0)) when
# there are fewer than four rows.
cut_off_index <- nrow(df_data) %/% 4
test_data <- na.omit(df_data[seq_len(cut_off_index), ])
train_data <- na.omit(df_data[seq_len(nrow(df_data)) > cut_off_index, ])

# Dummy-column names are built from factor levels and can contain spaces and
# slashes; replace both with "." so the names are usable in a model formula.
names(test_data) <- gsub("[ /]", ".", names(test_data))
names(train_data) <- gsub("[ /]", ".", names(train_data))

# The resolution time is wrapped in as.factor(), so randomForest() fits a
# classification forest with one class per distinct date_diff value.
rf_clf <- randomForest(as.factor(date_diff) ~ ., data = train_data)
print(rf_clf)
## 
## Call:
##  randomForest(formula = as.factor(date_diff) ~ ., data = train_data) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 12.26%
## Confusion matrix:
##      0 1 2 3 4 class.error
## 0 5196 0 0 0 0           0
## 1  712 0 0 0 0           1
## 2    5 0 0 0 0           1
## 3    8 0 0 0 0           1
## 4    1 0 0 0 0           1
# Show the per-predictor importance matrix stored on the fitted forest.
rf_clf[["importance"]]
##                                          MeanDecreaseGini
## Address.Type.                                  0.25043847
## Address.Type.ADDRESS                           0.87935544
## Address.Type.BLOCKFACE                         0.72448637
## Address.Type.INTERSECTION                      0.96615151
## Address.Type.LATLONG                           0.00000000
## Address.Type.PLACENAME                         0.52634256
## Borough.BRONX                                  3.34634234
## Borough.BROOKLYN                               1.15454622
## Borough.MANHATTAN                              4.40276785
## Borough.QUEENS                                 0.76353095
## Borough.STATEN.ISLAND                          1.05288555
## Borough.Unspecified                            0.00000000
## Location.Type.                                 0.00000000
## Location.Type.Bridge                           0.16446193
## Location.Type.Club.Bar.Restaurant              0.00000000
## Location.Type.Commercial                       0.00000000
## Location.Type.Ferry                            0.00000000
## Location.Type.Highway                          0.25948062
## Location.Type.House.and.Store                  0.00000000
## Location.Type.House.of.Worship                 0.00000000
## Location.Type.Park                             0.00000000
## Location.Type.Park.Playground                  0.96512999
## Location.Type.Parking.Lot                      0.00000000
## Location.Type.Residential.Building             0.00000000
## Location.Type.Residential.Building.House       1.91229758
## Location.Type.Roadway.Tunnel                   0.03390475
## Location.Type.Store.Commercial                 1.41663714
## Location.Type.Street.Sidewalk                  0.92909824
## Location.Type.Subway.Station                   0.00000000
## Location.Type.Terminal                         0.00000000
## Location.Type.Vacant.Lot                       0.00000000
# Predicted resolution-time class for every held-out row.
pred_y <- predict(rf_clf, newdata = test_data)

Conclusion:

The OOB estimate of the error rate is 12.26%, which is an unbiased estimate of the error on a test set of the same size as the training set. From the importance levels, we can see that the factor ‘Borough’ has the biggest impact on the resolution time (Manhattan and Bronx have the biggest MeanDecreaseGini). ‘Address type’ and ‘location type’ also have an impact on resolution time; in particular, the ‘ADDRESS’ type under ‘Address type’ and the ‘Residential.Building.House’ type under ‘location type’ have high power in predicting our response variable. (I also tested Agency.Name, which has 3 levels, and found that it has no impact.)

Lastly, I used the test dataset to compare the original graph and the predicted graph.

# Observed vs. predicted resolution time for the held-out complaints.
#
# Rows are selected by the row names of test_data rather than by position
# 1:cut_off_index. test_data can have fewer rows than the positional slice
# (rows with an empty Location.Type or a missing value are dropped before
# scoring), and pred_y has exactly one entry per test_data row, so matching
# on row names keeps each prediction attached to the complaint it was made
# for.
df_test_orig <- df_hl_diff[rownames(test_data), ]

# Same complaints with the observed date_diff replaced by the prediction.
df_test_pred <- cbind(
  df_test_orig[setdiff(names(df_test_orig), "date_diff")],
  pred_y
)

bc_map_test_orig <- get_bc_map(df_test_orig)
## converting bounding box to center/zoom specification. (experimental)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=40.70301,-73.981617&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false

# Observed resolution time.
ggmap(bc_map_test_orig) +
  geom_point(
    data = df_test_orig,
    mapping = aes(x = Longitude, y = Latitude, colour = factor(date_diff))
  ) +
  labs(
    title = "Homeless Encampment Issue Solving Time Distribution",
    x = "Longitude",
    y = "Latitude",
    colour = "Length of Time"
  )

bc_map_test_pred <- get_bc_map(df_test_pred)
## converting bounding box to center/zoom specification. (experimental)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=40.70301,-73.981617&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false

# Predicted resolution time. The colour must come from pred_y (already a
# factor returned by predict()); colouring by the observed date_diff here
# would simply redraw the map above.
ggmap(bc_map_test_pred) +
  geom_point(
    data = df_test_pred,
    mapping = aes(x = Longitude, y = Latitude, colour = pred_y)
  ) +
  labs(
    title = "Predicted Homeless Encampment Issue Solving Time Distribution",
    x = "Longitude",
    y = "Latitude",
    colour = "Length of Time"
  )